import speech_recognition as sr
import os
import IPython.display as ipd
import pandas as pd
# Gather the path of every ".wav" file found anywhere beneath the Normal
# sample folder. The extension comparison is case-sensitive.
normal_path = 'sample-audios/Normal'
normal_sample_data = [
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk(normal_path)
    for file_name in file_names
    if os.path.splitext(file_name)[1] == '.wav'
]
normal_sample_data
['sample-audios/Normal\\Normal (95).wav', 'sample-audios/Normal\\Normal (96).wav', 'sample-audios/Normal\\Normal (97).wav', 'sample-audios/Normal\\Normal (98).wav', 'sample-audios/Normal\\Normal (99).wav']
# Gather the path of every ".wav" file found anywhere beneath the Toxic
# sample folder. The extension comparison is case-sensitive.
toxic_path = 'sample-audios/Toxic'
toxic_sample_data = [
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk(toxic_path)
    for file_name in file_names
    if os.path.splitext(file_name)[1] == '.wav'
]
toxic_sample_data
['sample-audios/Toxic\\Toxic (95).wav', 'sample-audios/Toxic\\Toxic (96).wav', 'sample-audios/Toxic\\Toxic (97).wav', 'sample-audios/Toxic\\Toxic (98).wav', 'sample-audios/Toxic\\Toxic (99).wav']
# Transcribing the Toxic sample audio clips
# =========================================
# For each clip: show an inline audio player, read the whole file into an
# AudioData object, pass it to `recognizer.recognize_google`, print
# "<path>,<transcript>" and collect the transcript in `transcription_record`.
transcription_record = []
recognizer = sr.Recognizer()
recognizer.energy_threshold = 300
for audio_path in toxic_sample_data:
    print("Type: \n\tToxic\n")
    ipd.display(ipd.Audio(audio_path))
    # Read the audio file and convert it to AudioData (entire file, no offset).
    with sr.AudioFile(audio_path) as source:
        sample_audio = recognizer.record(source)
    # NOTE(review): the clips are described as Urdu speech but the request
    # uses language="en-US" -- confirm this is the intended setting.
    try:
        transcription = recognizer.recognize_google(audio_data=sample_audio, language="en-US")
    except sr.UnknownValueError:
        # Unintelligible audio: skip this clip rather than abort the whole
        # batch and lose the transcripts gathered so far. Request/network
        # failures (sr.RequestError) are deliberately left to propagate.
        print(audio_path + ',<no speech recognised - skipped>')
        continue
    print(audio_path + ',' + transcription)
    transcription_record.append(transcription)
Type: Toxic
sample-audios/Toxic\Toxic (95).wav,kya bolun Main Nahin matlab kya bulwana kya chahte ho bhai Banda Salam Kar Raha Hai Dua de raha hai koi return hi nahin a Raha Nahin to Fir kya bole Banda Koi Awaaz hi nahin a Rahi tumhara mobile mobile ko bhai Type: Toxic
sample-audios/Toxic\Toxic (96).wav,Tumne to Mujhe Pagal samjha hai ki main tumhari baat hai main a Jaunga Type: Toxic
sample-audios/Toxic\Toxic (97).wav,Kasur Tera nahin tum University jagna mummy daddy baccha ban gaya Type: Toxic
sample-audios/Toxic\Toxic (98).wav,mujhe nahin Pata Tha ki tum meri feelings ke sath Is Tarah ko Khel Ho Gaya laddu Mein Puche Tumhe Barbad Karega idiot Type: Toxic
sample-audios/Toxic\Toxic (99).wav,Rajasthan Rajasthan to bada Calcutta Jayegi Deepak Ke Liye Koi tamiz Koi real Koi Aaya hi nahi Naat
# Turn the collected transcripts into a one-column table and tag every row
# with the constant label "Toxic".
column_names = ["Transcription"]
Transcription_file_toxic = pd.DataFrame(
    data=transcription_record,
    columns=column_names,
).assign(Type="Toxic")
Transcription_file_toxic
| | Transcription | Type |
|---|---|---|
| 0 | kya bolun Main Nahin matlab kya bulwana kya ch... | Toxic |
| 1 | Tumne to Mujhe Pagal samjha hai ki main tumhar... | Toxic |
| 2 | Kasur Tera nahin tum University jagna mummy da... | Toxic |
| 3 | mujhe nahin Pata Tha ki tum meri feelings ke s... | Toxic |
| 4 | Rajasthan Rajasthan to bada Calcutta Jayegi De... | Toxic |
# Transcribing the Normal sample audio clips
# ==========================================
# Same procedure as for the Toxic clips, reusing the `recognizer` created
# earlier in the file. `transcription_record` is reset so that it only holds
# the Normal transcripts.
transcription_record = []
for audio_path in normal_sample_data:
    print("Type: \n\tNormal\n")
    ipd.display(ipd.Audio(audio_path))
    # Read the audio file and convert it to AudioData (entire file, no offset).
    with sr.AudioFile(audio_path) as source:
        sample_audio = recognizer.record(source)
    # NOTE(review): the clips are described as Urdu speech but the request
    # uses language="en-US" -- confirm this is the intended setting.
    try:
        transcription = recognizer.recognize_google(audio_data=sample_audio, language="en-US")
    except sr.UnknownValueError:
        # Unintelligible audio: skip this clip rather than abort the whole
        # batch and lose the transcripts gathered so far. Request/network
        # failures (sr.RequestError) are deliberately left to propagate.
        print(audio_path + ',<no speech recognised - skipped>')
        continue
    print(audio_path + ',' + transcription)
    transcription_record.append(transcription)
Type: Normal
sample-audios/Normal\Normal (95).wav,Mujhe include Na Karo Aisa Kuchh Bhi Nahin Hai Main Nahin aaungi Type: Normal
sample-audios/Normal\Normal (96).wav,annual party to December Mein Nahin Hoti abhi to November chal raha hai December 14 15 16 date Mein hoti hai Type: Normal
sample-audios/Normal\Normal (97).wav,aap to participate kar Type: Normal
sample-audios/Normal\Normal (98).wav,Maine pahle message Nahin pahle Mujhe yahan se batao like Nind a rahi hai Type: Normal
sample-audios/Normal\Normal (99).wav,yah chal kya raha hai
# Turn the collected transcripts into a one-column table and tag every row
# with the constant label "Normal".
column_names = ["Transcription"]
Transcription_file_normal = pd.DataFrame(
    data=transcription_record,
    columns=column_names,
).assign(Type="Normal")
Transcription_file_normal
| | Transcription | Type |
|---|---|---|
| 0 | Mujhe include Na Karo Aisa Kuchh Bhi Nahin Hai... | Normal |
| 1 | annual party to December Mein Nahin Hoti abhi ... | Normal |
| 2 | aap to participate kar | Normal |
| 3 | Maine pahle message Nahin pahle Mujhe yahan se... | Normal |
| 4 | yah chal kya raha hai | Normal |
# Stack the Toxic rows above the Normal rows. Each frame keeps its own row
# labels, so index values repeat in `result`; the CSV is written without the
# index column, so the file itself is unaffected by that.
frames = [
    Transcription_file_toxic,
    Transcription_file_normal,
]
result = pd.concat(objs=frames, axis=0)
result.to_csv(
    r'transcripted_data.csv',
    header=True,
    index=False,
)
result
| | Transcription | Type |
|---|---|---|
| 0 | kya bolun Main Nahin matlab kya bulwana kya ch... | Toxic |
| 1 | Tumne to Mujhe Pagal samjha hai ki main tumhar... | Toxic |
| 2 | Kasur Tera nahin tum University jagna mummy da... | Toxic |
| 3 | mujhe nahin Pata Tha ki tum meri feelings ke s... | Toxic |
| 4 | Rajasthan Rajasthan to bada Calcutta Jayegi De... | Toxic |
| 0 | Mujhe include Na Karo Aisa Kuchh Bhi Nahin Hai... | Normal |
| 1 | annual party to December Mein Nahin Hoti abhi ... | Normal |
| 2 | aap to participate kar | Normal |
| 3 | Maine pahle message Nahin pahle Mujhe yahan se... | Normal |
| 4 | yah chal kya raha hai | Normal |